Our goal in this exercise is to begin reaching a common agreement, as a class, on which terms we will keep as we selectively refine our corpus-wide vocabulary. This vocabulary will represent the content of each document for clustering and classification purposes, which is our next step. That means we need to make decisions — which terms are in, and which are out.
import pandas as pd
import os
import numpy as np
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import random
from dataclasses import dataclass
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import gensim
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import scipy.cluster.hierarchy
from IPython.display import display, HTML
from typing import List, Callable, Dict
from google.colab import drive
drive.mount('/content/gdrive/')
Mounted at /content/gdrive/
# Only run this once, they will be downloaded.
# Fetches the NLTK data packages used below: stop-word lists, WordNet (+ the
# omw-1.4 multilingual wordnet it depends on) for lemmatization, and the
# 'punkt' models for word_tokenize. Re-running is a cheap no-op once cached.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)
True
# Pin gensim to the 3.x API line: the code below uses Word2Vec(size=...) and
# model.wv.vocab, both removed in gensim 4.x. pkg_resources.require raises a
# VersionConflict if a newer gensim is installed.
import pkg_resources
pkg_resources.require("gensim<=3.8.3");
# Fix: the label was misspelled "Genism" in the original print.
print("Gensim Version: ", gensim.__version__)
Genism Version: 3.6.0
def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn; accepts and ignores any arguments."""
    pass


# HACK: monkey-patch warnings.warn globally so third-party libraries cannot
# emit warnings into the notebook output. Heavy-handed — in new code prefer
# warnings.filterwarnings('ignore'), which is scoped and reversible.
import warnings
warnings.warn = warn
def add_movie_descriptor(data: pd.DataFrame, corpus_df: pd.DataFrame):
    """
    Adds a "Descriptor" column to `data`, built from `corpus_df`, in the form
    {Genre}_{Movie Title}_{P|N}_{Doc_ID}, e.g. "Sci-Fi_Pitch Black_P_292".

    (The original docstring listed the order as {Genre}_{P|N}_{Movie Title}_
    {DocID}; the code concatenates title before the review flag.)
    """
    # 'P' for rows whose review type is exactly 'Positive', 'N' otherwise.
    review = np.where(corpus_df['Review Type (pos or neg)'] == 'Positive', 'P', 'N')
    data['Descriptor'] = corpus_df['Genre of Movie'] + '_' + corpus_df['Movie Title'] + '_' + review + '_' + corpus_df['Doc_ID'].astype(str)
def get_corpus_df(path):
    """
    Load the class-corpus CSV at `path` and return it sorted by Descriptor
    and indexed by Doc_ID (Doc_ID is kept as a regular column as well, so it
    survives later resets and merges).
    """
    corpus = pd.read_csv(path, encoding="utf-8")
    add_movie_descriptor(corpus, corpus)
    corpus = corpus.sort_values(['Descriptor']).set_index(['Doc_ID'])
    corpus['Doc_ID'] = corpus.index
    return corpus
def remove_punctuation(text):
    """Replace every character that is not an ASCII letter with a space."""
    non_letters = re.compile('[^a-zA-Z]')
    return non_letters.sub(' ', str(text))
def lower_case(text):
    """Return `text` converted to lower case."""
    lowered = text.lower()
    return lowered
def remove_tags(text):
    """Replace each HTML/XML-style tag (e.g. "<br>", "</p>") with " <> "."""
    tag_pattern = re.compile("</?.*?>")
    return tag_pattern.sub(" <> ", text)
def remove_special_chars_and_digits(text):
    """Collapse every run of digits and/or non-word characters into one space."""
    pattern = re.compile(r"(\d|\W)+")
    return pattern.sub(" ", text)
@dataclass
class Document:
    """A single corpus document: its ID plus raw (or normalized) review text."""
    doc_id: str  # Doc_ID from the corpus CSV (NOTE: ints in practice, e.g. 191)
    text: str    # full review text
def normalize_document(document: Document) -> Document:
    """
    Return a new Document whose text has markup tags stripped, punctuation
    removed, casing lowered, and digit/special-character runs collapsed.

    Fix: tags are removed *before* punctuation. The original order ran
    remove_punctuation first, which replaced '<' and '>' with spaces, so
    remove_tags could never match and tag names (e.g. 'br') survived as
    word tokens. The " <> " placeholder that remove_tags inserts is
    cleaned up by the later steps.
    """
    text = document.text
    text = remove_tags(text)                      # strip markup while '<'/'>' still exist
    text = remove_punctuation(text)               # non-letters (incl. the '<>' placeholder) -> spaces
    text = lower_case(text)
    text = remove_special_chars_and_digits(text)  # collapse remaining runs
    return Document(document.doc_id, text)
def normalize_documents(documents: List[Document]) -> List[Document]:
    """
    Normalize the text of every document in the list.

    Removes punctuation, converts to lower case, and strips tags and special
    characters (see normalize_document for the per-document pipeline).
    """
    return list(map(normalize_document, documents))
@dataclass
class TokenizedDocument:
    """A document after tokenization: its ID plus the ordered word tokens."""
    doc_id: str        # same ID as the source Document
    tokens: List[str]  # word tokens in original order, duplicates preserved
def tokenize_document(document: Document) -> TokenizedDocument:
    """Split the document's text into word tokens via NLTK's default tokenizer."""
    return TokenizedDocument(document.doc_id, nltk.word_tokenize(document.text))
def tokenize_documents(documents: List[Document]) -> List[TokenizedDocument]:
    """Tokenize every document in the list (see tokenize_document)."""
    tokenized = []
    for doc in documents:
        tokenized.append(tokenize_document(doc))
    return tokenized
# def lemmatize(documents: List[TokenizedDocument]) -> List[TokenizedDocument]:
# result = []
# lemmatizer = WordNetLemmatizer()
# for document in documents:
# output_tokens = [lemmatizer.lemmatize(w) for w in document.tokens]
# result.append(TokenizedDocument(document.doc_id, output_tokens))
# return result
# def stem(documents: List[TokenizedDocument]) -> List[TokenizedDocument]:
# result = []
# stemmer = PorterStemmer()
# for document in documents:
# output_tokens = [stemmer.stem(w) for w in document.tokens]
# result.append(TokenizedDocument(document.doc_id, output_tokens))
# return result
# def remove_stop_words(documents: List[TokenizedDocument]) -> List[TokenizedDocument]:
# result = []
# stop_words = set(nltk.corpus.stopwords.words('english'))
# for document in documents:
# filtered_tokens = [w for w in document.tokens if not w in stop_words]
# result.append(TokenizedDocument(document.doc_id, filtered_tokens))
# return result
def add_flags(data: pd.DataFrame, us_doc_ids: List[int], horror_doc_ids: List[int]):
    """
    Add boolean 'is_us' / 'is_horror' columns to `data`, flagging rows whose
    index (Doc_ID) belongs to the 'Us' reviews or to the horror genre.
    """
    for column, ids in (('is_us', us_doc_ids), ('is_horror', horror_doc_ids)):
        data[column] = data.index.isin(ids)
def get_all_tokens(documents: List[TokenizedDocument]) -> List[str]:
    """Return the corpus vocabulary: all distinct tokens, sorted ascending."""
    vocabulary = set()
    for document in documents:
        vocabulary.update(document.tokens)
    return sorted(vocabulary)
# Previous (public GitHub) corpus location, kept for reference:
# CORPUS_PATH=\
# 'https://raw.githubusercontent.com/djp840/MSDS_453_Public/main/MSDS453_ClassCorpus/MSDS453_QA_20220906.csv'
# corpus_df = get_corpus_df(CORPUS_PATH)
# documents = [Document(x, y) for x, y in zip(corpus_df.Doc_ID, corpus_df.Text)]

# Load the class corpus from the mounted Google Drive and wrap each row
# (Doc_ID, Text) as a Document for the cleaning pipeline.
CORPUS_PATH = '/content/gdrive/MyDrive/Colab Notebooks/MSDS_453_Public-main/MSDS453_ClassCorpus/MSDS453_ClassCorpus_Final_Sec56_v4_20230115.csv'
corpus_df = get_corpus_df(CORPUS_PATH)
documents = [Document(x, y) for x, y in zip(corpus_df.Doc_ID, corpus_df.Text)]
corpus_df.shape
(200, 9)
corpus_df.tail(4).T
| Doc_ID | 292 | 293 | 294 | 295 |
|---|---|---|---|---|
| DSI_Title | TWH_Doc2_PitchBlack | TWH_Doc3_PitchBlack | TWH_Doc4_PitchBlack | TWH_Doc5_PitchBlack |
| Submission File Name | TWH_Doc2_PitchBlack | TWH_Doc3_PitchBlack | TWH_Doc4_PitchBlack | TWH_Doc5_PitchBlack |
| Student Name | TWH | TWH | TWH | TWH |
| Genre of Movie | Sci-Fi | Sci-Fi | Sci-Fi | Sci-Fi |
| Review Type (pos or neg) | Positive | Positive | Positive | Positive |
| Movie Title | Pitch Black | Pitch Black | Pitch Black | Pitch Black |
| Text | It's Vin Diesel week! Suddenly, this largely u... | Practitioner of guilty pleasure B-movies-such ... | Richard B. Riddick (Vin Diesel) can't seem to ... | Science fiction is often described as "a genre... |
| Descriptor | Sci-Fi_Pitch Black_P_292 | Sci-Fi_Pitch Black_P_293 | Sci-Fi_Pitch Black_P_294 | Sci-Fi_Pitch Black_P_295 |
| Doc_ID | 292 | 293 | 294 | 295 |
print(corpus_df.info());
<class 'pandas.core.frame.DataFrame'> Int64Index: 200 entries, 191 to 295 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DSI_Title 200 non-null object 1 Submission File Name 200 non-null object 2 Student Name 200 non-null object 3 Genre of Movie 200 non-null object 4 Review Type (pos or neg) 200 non-null object 5 Movie Title 200 non-null object 6 Text 200 non-null object 7 Descriptor 200 non-null object 8 Doc_ID 200 non-null int64 dtypes: int64(1), object(8) memory usage: 15.6+ KB None
print(corpus_df['Movie Title'].unique())
['Angel Has Fallen' 'Inception' 'No Time To Die' 'Taken' 'Taxi' 'Despicable Me 3' 'Dirty Grandpa' 'Grown Ups' 'Legally Blonde' 'Lost City' 'Drag me to Hell' 'Fresh' 'It Chapter Two' 'The Toxic Avenger' 'Us' 'Batman' 'Everything Everywhere All at Once' 'Minority Report' 'Oblivion' 'Pitch Black']
# Review count per genre: add a unit column and count it per group
# (the run below shows 50 reviews in each of the four genres).
counts_df = corpus_df[['Genre of Movie']].copy()
counts_df['Count'] = 1
counts_df.groupby(['Genre of Movie']).count().reset_index()
| Genre of Movie | Count | |
|---|---|---|
| 0 | Action | 50 |
| 1 | Comedy | 50 |
| 2 | Horror | 50 |
| 3 | Sci-Fi | 50 |
corpus_df.columns
Index(['DSI_Title', 'Submission File Name', 'Student Name', 'Genre of Movie',
'Review Type (pos or neg)', 'Movie Title', 'Text', 'Descriptor',
'Doc_ID'],
dtype='object')
corpus_df.groupby(['Genre of Movie', 'Movie Title']).count()['Review Type (pos or neg)']
Genre of Movie Movie Title
Action Angel Has Fallen 10
Inception 10
No Time To Die 10
Taken 10
Taxi 10
Comedy Despicable Me 3 10
Dirty Grandpa 10
Grown Ups 10
Legally Blonde 10
Lost City 10
Horror Drag me to Hell 10
Fresh 10
It Chapter Two 10
The Toxic Avenger 10
Us 10
Sci-Fi Batman 10
Everything Everywhere All at Once 10
Minority Report 10
Oblivion 10
Pitch Black 10
Name: Review Type (pos or neg), dtype: int64
corpus_df.groupby(['Genre of Movie', 'Review Type (pos or neg)']).count()['Movie Title']
Genre of Movie Review Type (pos or neg)
Action Negative 25
Positive 25
Comedy Negative 25
Positive 25
Horror Negative 25
Positive 25
Sci-Fi Negative 25
Positive 25
Name: Movie Title, dtype: int64
normalized_documents = normalize_documents(documents)
normalized_documents[0]
Document(doc_id=191, text='ric roman waugh s angel has fallen sees u s secret service agent mike banning gerard butler having returned to his home turf after a trip across the pond contending with more threats to international security mercifully jettisoning the islamophobia of babak najafi s london has fallen the film finds mike framed for an assassination attempt on president allan trumbull morgan freeman and clearing his name appears to hinge at least at first on whether or not he learns to grapple with the physical and psychological toll of his previous exploits as multiple concussions and spinal injuries have saddled him with insomnia and an addiction to painkillers as a fleet of miniature drones slaughters trumbull s security detail it s impossible not to think of mike s early interactions with his old army ranger buddy wade jennings danny huston the head of a private military contractor angel has fallen introduces wade openly lamenting that trumbull s efforts to stop war profiteering have hurt his business it s almost to the film s credit that jennings s guilt is never in doubt meaning that mike s subsequent escape from custody and quest to clear his name are rooted solely in his having to contend with his traitorous friend s private army and not some dull mystery centered around who set him up still the film s framing of mike as the most wanted man in america is clumsily executed given that his face is plastered on screens all over the country you d think that the man would be trying to avoid public exposure at all times in practice though mike is almost always in plain sight never making any attempt to disguise himself almost as if he s aware that no one ever seems to recognize him the only exception to this rule is when he s held up by two armed backwoods militiamen in angel has fallen s most baffling scene after all when one imagines the sort of people who might be driven to an outraged citizen s arrest over an attack on a liberal black president one 
doesn t immediately think of white nationalists given the lack of significant impediment to mike s movements the film allows plenty of space for action thrills but waugh seems to have cobbled his set pieces together from a series of close ups edited as if by random selection and because of so much coherence defying shot continuity it s impossible to tell what s happening during any given skirmish even the nonviolent scenes are jittery and aggressive a close up at one point tracks a character picking up a phone with a whip pan so fast that the shot slips out of focus worst of all though are the special effects that mark the more grandiose set pieces with smoke from massive explosions hanging statically in the air as a giant solid mass and in the film s rooftop climax the obvious use of green screen revealing image artifacts around the actors faces it doesn t help that three films into the fallen series mike has almost paradoxically lost some of his dimensionality as a character in an age')
https://www.nltk.org/api/nltk.tokenize.html
Tokenizers divide strings into lists of substrings. For example, tokenizers can be used to find the words and punctuation in a string:
tokenized_documents = tokenize_documents(normalized_documents)
tokenized_documents[0]
TokenizedDocument(doc_id=191, tokens=['ric', 'roman', 'waugh', 's', 'angel', 'has', 'fallen', 'sees', 'u', 's', 'secret', 'service', 'agent', 'mike', 'banning', 'gerard', 'butler', 'having', 'returned', 'to', 'his', 'home', 'turf', 'after', 'a', 'trip', 'across', 'the', 'pond', 'contending', 'with', 'more', 'threats', 'to', 'international', 'security', 'mercifully', 'jettisoning', 'the', 'islamophobia', 'of', 'babak', 'najafi', 's', 'london', 'has', 'fallen', 'the', 'film', 'finds', 'mike', 'framed', 'for', 'an', 'assassination', 'attempt', 'on', 'president', 'allan', 'trumbull', 'morgan', 'freeman', 'and', 'clearing', 'his', 'name', 'appears', 'to', 'hinge', 'at', 'least', 'at', 'first', 'on', 'whether', 'or', 'not', 'he', 'learns', 'to', 'grapple', 'with', 'the', 'physical', 'and', 'psychological', 'toll', 'of', 'his', 'previous', 'exploits', 'as', 'multiple', 'concussions', 'and', 'spinal', 'injuries', 'have', 'saddled', 'him', 'with', 'insomnia', 'and', 'an', 'addiction', 'to', 'painkillers', 'as', 'a', 'fleet', 'of', 'miniature', 'drones', 'slaughters', 'trumbull', 's', 'security', 'detail', 'it', 's', 'impossible', 'not', 'to', 'think', 'of', 'mike', 's', 'early', 'interactions', 'with', 'his', 'old', 'army', 'ranger', 'buddy', 'wade', 'jennings', 'danny', 'huston', 'the', 'head', 'of', 'a', 'private', 'military', 'contractor', 'angel', 'has', 'fallen', 'introduces', 'wade', 'openly', 'lamenting', 'that', 'trumbull', 's', 'efforts', 'to', 'stop', 'war', 'profiteering', 'have', 'hurt', 'his', 'business', 'it', 's', 'almost', 'to', 'the', 'film', 's', 'credit', 'that', 'jennings', 's', 'guilt', 'is', 'never', 'in', 'doubt', 'meaning', 'that', 'mike', 's', 'subsequent', 'escape', 'from', 'custody', 'and', 'quest', 'to', 'clear', 'his', 'name', 'are', 'rooted', 'solely', 'in', 'his', 'having', 'to', 'contend', 'with', 'his', 'traitorous', 'friend', 's', 'private', 'army', 'and', 'not', 'some', 'dull', 'mystery', 'centered', 'around', 'who', 'set', 'him', 'up', 
'still', 'the', 'film', 's', 'framing', 'of', 'mike', 'as', 'the', 'most', 'wanted', 'man', 'in', 'america', 'is', 'clumsily', 'executed', 'given', 'that', 'his', 'face', 'is', 'plastered', 'on', 'screens', 'all', 'over', 'the', 'country', 'you', 'd', 'think', 'that', 'the', 'man', 'would', 'be', 'trying', 'to', 'avoid', 'public', 'exposure', 'at', 'all', 'times', 'in', 'practice', 'though', 'mike', 'is', 'almost', 'always', 'in', 'plain', 'sight', 'never', 'making', 'any', 'attempt', 'to', 'disguise', 'himself', 'almost', 'as', 'if', 'he', 's', 'aware', 'that', 'no', 'one', 'ever', 'seems', 'to', 'recognize', 'him', 'the', 'only', 'exception', 'to', 'this', 'rule', 'is', 'when', 'he', 's', 'held', 'up', 'by', 'two', 'armed', 'backwoods', 'militiamen', 'in', 'angel', 'has', 'fallen', 's', 'most', 'baffling', 'scene', 'after', 'all', 'when', 'one', 'imagines', 'the', 'sort', 'of', 'people', 'who', 'might', 'be', 'driven', 'to', 'an', 'outraged', 'citizen', 's', 'arrest', 'over', 'an', 'attack', 'on', 'a', 'liberal', 'black', 'president', 'one', 'doesn', 't', 'immediately', 'think', 'of', 'white', 'nationalists', 'given', 'the', 'lack', 'of', 'significant', 'impediment', 'to', 'mike', 's', 'movements', 'the', 'film', 'allows', 'plenty', 'of', 'space', 'for', 'action', 'thrills', 'but', 'waugh', 'seems', 'to', 'have', 'cobbled', 'his', 'set', 'pieces', 'together', 'from', 'a', 'series', 'of', 'close', 'ups', 'edited', 'as', 'if', 'by', 'random', 'selection', 'and', 'because', 'of', 'so', 'much', 'coherence', 'defying', 'shot', 'continuity', 'it', 's', 'impossible', 'to', 'tell', 'what', 's', 'happening', 'during', 'any', 'given', 'skirmish', 'even', 'the', 'nonviolent', 'scenes', 'are', 'jittery', 'and', 'aggressive', 'a', 'close', 'up', 'at', 'one', 'point', 'tracks', 'a', 'character', 'picking', 'up', 'a', 'phone', 'with', 'a', 'whip', 'pan', 'so', 'fast', 'that', 'the', 'shot', 'slips', 'out', 'of', 'focus', 'worst', 'of', 'all', 'though', 'are', 'the', 'special', 
'effects', 'that', 'mark', 'the', 'more', 'grandiose', 'set', 'pieces', 'with', 'smoke', 'from', 'massive', 'explosions', 'hanging', 'statically', 'in', 'the', 'air', 'as', 'a', 'giant', 'solid', 'mass', 'and', 'in', 'the', 'film', 's', 'rooftop', 'climax', 'the', 'obvious', 'use', 'of', 'green', 'screen', 'revealing', 'image', 'artifacts', 'around', 'the', 'actors', 'faces', 'it', 'doesn', 't', 'help', 'that', 'three', 'films', 'into', 'the', 'fallen', 'series', 'mike', 'has', 'almost', 'paradoxically', 'lost', 'some', 'of', 'his', 'dimensionality', 'as', 'a', 'character', 'in', 'an', 'age'])
# Lookup tables keyed by Doc_ID.
titles_by_doc_ids = {x: y for x, y in zip(corpus_df['Doc_ID'], corpus_df['Movie Title'])}
genres_by_doc_ids = {x: y for x, y in zip(corpus_df['Doc_ID'], corpus_df['Genre of Movie'])}
descriptors_by_doc_ids = {x: y for x, y in zip(corpus_df['Doc_ID'], corpus_df['Descriptor'])}
# Partition the corpus into horror and non-horror id sets / document lists.
horror_doc_ids = [int(x) for x in corpus_df['Doc_ID'] if genres_by_doc_ids[x] == 'Horror']
horror_documents = [x for x in documents if x.doc_id in horror_doc_ids]
non_horror_doc_ids = {int(x) for x in corpus_df['Doc_ID'] if genres_by_doc_ids[x] != 'Horror'}
non_horror_documents = [x for x in documents if x.doc_id in non_horror_doc_ids]
print(corpus_df['Movie Title'].unique())
['Angel Has Fallen' 'Inception' 'No Time To Die' 'Taken' 'Taxi' 'Despicable Me 3' 'Dirty Grandpa' 'Grown Ups' 'Legally Blonde' 'Lost City' 'Drag me to Hell' 'Fresh' 'It Chapter Two' 'The Toxic Avenger' 'Us' 'Batman' 'Everything Everywhere All at Once' 'Minority Report' 'Oblivion' 'Pitch Black']
# Doc_IDs and documents for reviews of the movie 'Us'.
us_doc_ids = [int(x) for x in corpus_df['Doc_ID'] if titles_by_doc_ids[x] == 'Us']
us_documents = [x for x in documents if x.doc_id in us_doc_ids]

# Candidate vocabulary terms to evaluate against the 'Us' reviews.
# NOTE(review): the count matrix below uses ngram_range=(1, 1), so the bigram
# 'get out' can never match; 'doppleganger' appears to be a misspelling of
# 'doppelganger' — both rows come out all-zero in the frequency table.
candidate_terms = [
    'horror',
    'wilsons',
    'family',
    'adelaide',
    'get out',
    'peele',
    'nyong',
    'vacation',
    'doppleganger',
    'film',
    'park',
    'california',
    'monsters',
    'beach',
    'scary',
    'escape',
    'shadow',
    'us',
    'boat',
    'scissors'
]
# Raw unigram count matrix over the normalized (not stop-word-filtered) corpus,
# one row per document, one column per vocabulary word.
vectorizer = CountVectorizer(ngram_range=(1, 1))
text_for_counts = [x.text for x in normalized_documents]
matrix = vectorizer.fit_transform(text_for_counts)
words = vectorizer.get_feature_names_out()
word_counts = pd.DataFrame(matrix.toarray(), columns=words, index=corpus_df.Doc_ID)
add_flags(word_counts, us_doc_ids, horror_doc_ids)
word_counts['Doc_ID'] = word_counts.index

# Collect result into a dataframe: mean per-document frequency of each
# candidate term within the 'Us' reviews, all horror reviews, and all
# non-horror reviews. Candidate terms absent from the vocabulary are dropped
# by the column filter, leaving NaN rows (filled with 0 on display).
mean_frequencies = pd.DataFrame(index=candidate_terms)
us_mean_frequencies = word_counts[word_counts.is_us][[x for x in candidate_terms if x in word_counts.columns]].mean()
mean_frequencies['Us'] = us_mean_frequencies
horror_mean_frequencies = word_counts[word_counts.is_horror][[x for x in candidate_terms if x in word_counts.columns]].mean()
mean_frequencies['All Horror'] = horror_mean_frequencies
non_horror_mean_frequencies = word_counts[~word_counts.is_horror][[x for x in candidate_terms if x in word_counts.columns]].mean()
mean_frequencies['All Non-Horror'] = non_horror_mean_frequencies
mean_frequencies.fillna(0.0).sort_values(['Us'], ascending=False)
| Us | All Horror | All Non-Horror | |
|---|---|---|---|
| peele | 4.3 | 0.88 | 0.000000 |
| us | 4.2 | 1.10 | 0.306667 |
| family | 3.1 | 0.64 | 0.366667 |
| horror | 2.7 | 1.76 | 0.080000 |
| adelaide | 2.7 | 0.54 | 0.000000 |
| film | 1.9 | 2.34 | 2.586667 |
| nyong | 1.5 | 0.30 | 0.000000 |
| wilsons | 1.5 | 0.30 | 0.000000 |
| beach | 0.9 | 0.18 | 0.033333 |
| vacation | 0.8 | 0.18 | 0.033333 |
| boat | 0.5 | 0.10 | 0.013333 |
| shadow | 0.5 | 0.14 | 0.013333 |
| scissors | 0.5 | 0.10 | 0.000000 |
| california | 0.4 | 0.18 | 0.066667 |
| park | 0.4 | 0.10 | 0.060000 |
| scary | 0.3 | 0.18 | 0.026667 |
| monsters | 0.2 | 0.08 | 0.040000 |
| escape | 0.2 | 0.10 | 0.100000 |
| doppleganger | 0.0 | 0.00 | 0.000000 |
| get out | 0.0 | 0.00 | 0.000000 |
# Terms identified from the frequency table above as both important to the
# 'Us' reviews and prevalent across the corpus.
important_prevalent_terms = [
    'horror',
    'us',
    'family',
    'film'
]
# Stemmed variants, currently disabled:
# stemmer = PorterStemmer()
# stemmed_important_prevalent_terms = [stemmer.stem(x) for x in important_prevalent_terms]
# Display floats with two decimals, then show the selected rows sorted by 'Us'.
pd.options.display.float_format = '{:,.2f}'.format
mean_frequencies.fillna(0.0).loc[important_prevalent_terms].round(2).sort_values(['Us'], ascending=False)
| Us | All Horror | All Non-Horror | |
|---|---|---|---|
| us | 4.20 | 1.10 | 0.31 |
| family | 3.10 | 0.64 | 0.37 |
| horror | 2.70 | 1.76 | 0.08 |
| film | 1.90 | 2.34 | 2.59 |
def run_tfidf(documents: List[Document],
              clean_func: Callable[[List[Document]], List[TokenizedDocument]],
              important_prevalent_terms: List[str],
              experiment_name: str,
              output_tfidf_vectors: bool = False,
              output_vocabulary: bool = True):
    """
    Run one TF-IDF experiment over the corpus.

    Cleans the documents with `clean_func`, fits an unnormalized unigram
    TfidfVectorizer, optionally writes per-document TF-IDF CSVs and the
    vocabulary to output/{experiment_name}_Results/, displays the top-10
    mean TF-IDF terms, plots a histogram of mean scores, and saves/shows a
    document-by-document cosine-similarity heatmap.

    Relies on module-level globals: corpus_df, us_doc_ids, horror_doc_ids,
    descriptors_by_doc_ids.
    """
    cleaned_documents = clean_func(documents)
    cleaned_document_text = [' '.join(x.tokens) for x in cleaned_documents]
    # norm=None keeps raw tf*idf magnitudes (no per-document L2 scaling).
    vectorizer = TfidfVectorizer(use_idf=True,
                                 ngram_range=(1, 1),
                                 norm=None)
    transformed_documents = vectorizer.fit_transform(cleaned_document_text)
    transformed_documents_as_array = transformed_documents.toarray()
    output_dir = f'output/{experiment_name}_Results'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if output_tfidf_vectors:
        # One CSV per document: terms sorted by descending TF-IDF score.
        for counter, doc in enumerate(transformed_documents_as_array):
            tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
            one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score'])\
                .sort_values(by='score', ascending=False)\
                .reset_index(drop=True)
            # NOTE(review): `counter` is a 0-based position but corpus_df is
            # indexed by Doc_ID (191..295 in this run), and Series[...] is a
            # *label* lookup — this appears to mismatch or KeyError for ids
            # outside the positional range. Unused by default; verify before
            # enabling output_tfidf_vectors.
            one_doc_as_df.to_csv(f'{output_dir}/{corpus_df["Submission File Name"][counter]}')
    if output_vocabulary:
        with open(f'{output_dir}/vocabulary.txt', 'w') as vocab:
            words = sorted(vectorizer.get_feature_names_out())
            print('\n'.join(words), file=vocab)
    # Create document-term dataframe
    doc_term_matrix = transformed_documents.todense()
    doc_term_df = pd.DataFrame(doc_term_matrix,
                               columns=vectorizer.get_feature_names_out(),
                               index=corpus_df.Doc_ID)
    add_flags(doc_term_df, us_doc_ids, horror_doc_ids)
    # Print the top 10 mean TF-IDF values
    top10_tfidf = pd.DataFrame(doc_term_df.mean().sort_values(ascending=False).head(10))
    top10_tfidf.rename(columns={0: 'Mean TF-IDF'}, inplace=True)
    display(top10_tfidf)
    # Collect result into a dataframe
    # NOTE(review): tfidf_results is built but never returned or displayed.
    tfidf_results = pd.DataFrame(index=important_prevalent_terms)
    all_tfidf_results = doc_term_df[[x for x in important_prevalent_terms if x in doc_term_df.columns]].mean().round(2)
    tfidf_results['All Movies'] = all_tfidf_results
    plt.hist(doc_term_df.mean(), 100, range=(0, 8))
    print(f'Vocabulary size: {doc_term_df.shape[1]}')
    # NOTE(review): `descriptors` is unused; labels come from the dict below.
    descriptors = corpus_df['Descriptor']
    # similarities = cosine_similarity(doc_term_df.loc[horror_doc_ids], doc_term_df.loc[horror_doc_ids])
    # NOTE(review): doc_term_df still contains the boolean is_us/is_horror
    # columns added above, so they participate in the cosine similarity.
    similarities = cosine_similarity(doc_term_df, doc_term_df)
    fig, ax = plt.subplots(figsize=(30, 30))
    labels = [descriptors_by_doc_ids[x.doc_id] for x in documents]
    sns.heatmap(ax=ax, data=similarities, xticklabels=labels, yticklabels=labels)
    plt.savefig(f'/content/gdrive/MyDrive/figures/{experiment_name}_heatmap_documents.png')
    plt.show()
def clean_method(documents: List[Document]) -> List[TokenizedDocument]:
    """
    Baseline cleaning pipeline: normalize text, then tokenize.

    (Lemmatization, stop-word removal, and stemming are deliberately left
    disabled below for this baseline experiment — the original docstring
    claimed they ran.)
    """
    documents = normalize_documents(documents)
    documents = tokenize_documents(documents)
    # documents = lemmatize(documents)
    # documents = remove_stop_words(documents)
    # documents = stem(documents)
    return documents
run_tfidf(documents, clean_method, important_prevalent_terms, 'TFIDF_exp_BL')
| Mean TF-IDF | |
|---|---|
| the | 27.39 |
| and | 14.05 |
| of | 13.50 |
| to | 13.01 |
| in | 8.20 |
| is | 7.92 |
| that | 6.25 |
| it | 5.83 |
| as | 4.46 |
| with | 4.43 |
Vocabulary size: 12159
def get_word2vec_vectors(documents: List[TokenizedDocument], embedding_size: int) -> pd.DataFrame:
    """
    Train a Word2Vec model on the token lists and return a dataframe with one
    row per vocabulary word (sorted alphabetically) and `embedding_size`
    columns (the embedding dimensions).

    NOTE: uses the gensim 3.x API (`size=` and `wv.vocab`), matching the
    "gensim<=3.8.3" pin earlier in the notebook; gensim 4.x renamed both.
    min_count=1 keeps every token, including hapaxes.
    """
    tokens = [x.tokens for x in documents]
    word2vec_model = Word2Vec(tokens, size=embedding_size, window=3, min_count=1, workers=12)
    vectors = {}
    for i in word2vec_model.wv.vocab:
        temp_vec = word2vec_model.wv[i]
        vectors[i] = temp_vec
    # Dict-of-columns -> transpose so words become the row index.
    result = pd.DataFrame(vectors).transpose()
    result = result.sort_index()
    return result
def plot_similarity_matrix(data: pd.DataFrame, experiment_name: str, figsize=(25, 25)):
    """
    Save (without showing) a heatmap of pairwise cosine similarities between
    the rows of `data`, labelled with the dataframe's index.
    """
    pairwise = cosine_similarity(data, data)
    figure, axes = plt.subplots(figsize=figsize)
    sns.heatmap(ax=axes, data=pairwise, xticklabels=data.index, yticklabels=data.index)
    plt.savefig(f'/content/gdrive/MyDrive/figures/{experiment_name}_heatmap.png')
    plt.close()
def plot_similarity_clustermap(data: pd.DataFrame, experiment_name: str, figsize=(25, 25)):
    """
    Save and show a hierarchically-clustered heatmap (complete linkage over
    cosine distance) of the pairwise cosine similarities between the rows of
    `data`. Row/column dendrograms are hidden.

    Fix: removed the plt.legend(loc='upper left') call — none of the artists
    here carry labels, so it only emitted the "No handles with labels found
    to put in legend" warnings visible in the original run log.
    """
    similarities = cosine_similarity(data, data)
    cm = sns.clustermap(similarities, metric='cosine', xticklabels=data.index, yticklabels=data.index, method='complete', cmap='RdBu', figsize=figsize)
    cm.ax_row_dendrogram.set_visible(False)
    cm.ax_col_dendrogram.set_visible(False)
    plt.savefig(f'/content/gdrive/MyDrive/figures/{experiment_name}_clustermap.png')
    plt.show()
    plt.close()
def plot_tsne(data: pd.DataFrame, perplexity: int, experiment_name: str, figsize=(40, 40)):
    """
    Create, save, and show a 2-D t-SNE scatter plot of the rows of `data`,
    annotating each point with its index label. random_state is fixed so the
    projection is reproducible.

    Fix: the original built x/y lists only to use len(x), then shadowed both
    names with scalars inside the plotting loop; the embedded points are now
    iterated directly, with identical plotting behavior.
    """
    tsne_model = TSNE(perplexity=perplexity, n_components=2, learning_rate='auto', init='pca', n_iter=1000, random_state=32)
    embedded = tsne_model.fit_transform(data)
    plt.figure(figsize=figsize)
    for label, point in zip(data.index, embedded):
        px, py = point[0], point[1]
        plt.scatter(px, py)
        plt.annotate(label,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(f'/content/gdrive/MyDrive/figures/{experiment_name}_tsne.png')
    plt.show()
    plt.close()
def run_word2vec_experiment(documents: List[Document],
                            clean_func: Callable[[List[Document]], List[TokenizedDocument]],
                            embedding_size: int,
                            chosen_tokens: List[str],
                            experiment_name: str):
    """
    Clean the documents, train Word2Vec embeddings of the given size, and
    produce a t-SNE plot, similarity heatmap, and clustermap restricted to
    `chosen_tokens`.

    Note: .loc raises KeyError if any chosen token is absent from the trained
    vocabulary (unlikely here since min_count=1 in get_word2vec_vectors).
    """
    cleaned_documents = clean_func(documents)
    word2vec_df = get_word2vec_vectors(cleaned_documents, embedding_size)
    filtered_word2vec_df = word2vec_df.loc[chosen_tokens].copy()
    plot_tsne(filtered_word2vec_df, 30, experiment_name)
    plot_similarity_matrix(filtered_word2vec_df, experiment_name)
    plot_similarity_clustermap(filtered_word2vec_df, experiment_name)
# Hand-picked terms guaranteed to be included in the examined token set.
extra_terms = [
    'horror',
    'vacation',
    'scary',
    'mother',
    'shadow'
]
# Get our terms to examine in experiments 4-12: 95 randomly sampled tokens
# plus the 5 extra terms above (100 total).
# NOTE(review): random.choices samples WITH replacement and no seed is set,
# so chosen_tokens may contain duplicates and differs between runs.
all_tokens = get_all_tokens(clean_method(documents))
chosen_tokens = random.choices(all_tokens, k=100 - len(extra_terms)) + extra_terms
# Lemmatized/stemmed variants, currently disabled:
# lemmatizer = WordNetLemmatizer()
# lemmatized_chosen_tokens = [lemmatizer.lemmatize(x) for x in chosen_tokens]
# stemmed_chosen_tokens = [stemmer.stem(x) for x in lemmatized_chosen_tokens]
run_word2vec_experiment(documents, clean_method, 100, chosen_tokens, 'BL_Word2Vec_100')
WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
WARNING:matplotlib.legend:No handles with labels found to put in legend.
def run_doc2vec(documents: List[TokenizedDocument], embedding_size: int, descriptors_by_doc_ids: Dict[int, str]):
    """
    Train a Doc2Vec model on the documents and return a dataframe with one
    inferred vector per document, indexed by the document's Descriptor.

    Fix: the vectors are collected once into a single DataFrame instead of
    the original per-document pd.concat, which re-copied the accumulated
    frame on every iteration (quadratic in the number of documents). The
    resulting frame (integer columns 0..embedding_size-1, Descriptor index)
    is unchanged.
    """
    tagged_documents = [TaggedDocument(document.tokens, [i]) for i, document in enumerate(documents)]
    doc2vec_model = Doc2Vec(tagged_documents, vector_size=embedding_size, window=3, min_count=2, workers=12)
    # infer_vector (rather than the trained document vectors) preserves the
    # original behavior exactly.
    vectors = [doc2vec_model.infer_vector(document.tokens) for document in documents]
    doc2vec_df = pd.DataFrame(vectors)
    doc2vec_df['Descriptor'] = [descriptors_by_doc_ids[x.doc_id] for x in documents]
    doc2vec_df.set_index(['Descriptor'], inplace=True)
    return doc2vec_df
def run_doc2vec_experiment(documents: List[Document],
                           clean_func: Callable[[List[Document]], List[TokenizedDocument]],
                           embedding_size: int,
                           experiment_name: str):
    """
    Clean the documents, train a Doc2Vec model of the given embedding size,
    and plot a similarity heatmap, a clustermap (larger 50x50 figure), and a
    t-SNE projection of the per-document vectors.

    Uses the module-level descriptors_by_doc_ids dict for row labels.
    """
    cleaned_documents = clean_func(documents)
    doc2vec_df = run_doc2vec(cleaned_documents, embedding_size, descriptors_by_doc_ids)
    plot_similarity_matrix(doc2vec_df, experiment_name)
    plot_similarity_clustermap(doc2vec_df, experiment_name, figsize=(50, 50))
    plot_tsne(doc2vec_df, 30, experiment_name)
run_doc2vec_experiment(documents, clean_method, 100, 'BL_Doc2Vec_100')
WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay WARNING:matplotlib.legend:No handles with labels found to put in legend.
run_word2vec_experiment(documents, clean_method, 200, chosen_tokens, 'BL_Word2Vec_200')
WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
WARNING:matplotlib.legend:No handles with labels found to put in legend.
run_doc2vec_experiment(documents, clean_method, 200, 'BL_Doc2Vec_200')
WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay WARNING:matplotlib.legend:No handles with labels found to put in legend.
run_word2vec_experiment(documents, clean_method, 300, chosen_tokens, 'BL_Word2Vec_300')
WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
WARNING:matplotlib.legend:No handles with labels found to put in legend.
run_doc2vec_experiment(documents, clean_method, 300, 'BL_Doc2Vec_300')
WARNING:gensim.models.base_any2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay WARNING:matplotlib.legend:No handles with labels found to put in legend.